#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')
library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
library(VennDiagram) ; library(venneuler)
library(knitr) ; library(expss)
library(biomaRt)
library(anRichment) ; library(BrainDiseaseCollection)
SFARI_colour_hue = function(r) {
pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}
# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal'=1)
# SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_08-29-2019_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]
# Gupta
load('./../../../Gupta/AllRegions/Data/preprocessed_data.RData')
DE_info_Gupta = DE_info %>% data.frame %>% mutate('ID' = rownames(datExpr), 'significant' = padj<0.05) %>%
dplyr::select(ID, log2FoldChange, padj, significant)
################################################################################
# Dataset with imputed values
load('./../Data/preprocessed_data_imputed.RData')
datExpr_imp = datExpr %>% data.frame
DE_info_imp = DE_info %>% data.frame
datMeta_imp = datMeta
# Update DE_info with Neuronal information
DE_info_imp = DE_info_imp %>% mutate('ID'=rownames(.)) %>% left_join(GO_neuronal, by='ID') %>%
mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
mutate(significant=padj<0.05 & !is.na(padj))
################################################################################
# Original dataset
load('./../Data/preprocessed_data.RData')
datExpr_orig = datExpr %>% data.frame
DE_info_orig = DE_info %>% data.frame
datMeta_orig = datMeta
# Update DE_info with Neuronal information
DE_info_orig = DE_info_orig %>% mutate('ID'=rownames(.)) %>% left_join(GO_neuronal, by='ID') %>%
mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
mutate(significant=padj<0.05 & !is.na(padj))
# Add gene symbol
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(datExpr), mart=mart)
rm(GO_annotations, datExpr, DE_info, datGenes, datMeta, dds, mart, getinfo)
The list of differentially expressed genes is very similar between the two preprocessing pipelines
Genes that lost their DE tag: As we saw in 20_03_11_z_score_outlier_methods.html, most of the outlier values in this dataset belonged to ASD samples, which could have created false positives in the DEA
Genes that gained their DE tag: By cleaning the outlier values, subtle differences may have been significant enough for the algorithm to capture now that were hidden by the outlier values before
all_genes = unique(c(rownames(datExpr_orig), rownames(datExpr_imp)))
genes_df = data.frame('Original' = all_genes %in% DE_info_orig$ID[DE_info_orig$significant],
'Imputed' = all_genes %in% DE_info_imp$ID[DE_info_imp$significant],
'Gupta' = all_genes %in% DE_info_Gupta$ID[DE_info_Gupta$significant])
rownames(genes_df) = all_genes
table_info = genes_df %>% apply_labels(Original = 'Original', Imputed = 'Imputed')
cro(table_info$Original, list(table_info$Imputed, total()))
|  Imputed |  #Total | |||
|---|---|---|---|---|
| Â FALSEÂ | Â TRUEÂ | Â | ||
|  Original | ||||
| Â Â Â FALSEÂ | 11391 | 257 | Â | 11648 |
| Â Â Â TRUEÂ | 251 | 4248 | Â | 4499 |
|    #Total cases | 11642 | 4505 |  | 16147 |
rm(table_info, all_genes)
grid.newpage()
grid.draw(draw.pairwise.venn(sum(genes_df$Original), sum(genes_df$Imputed), sum(genes_df$Original & genes_df$Imputed),
category = c('Original', 'Imputed'), fill = c('#e6b800', '#0099cc'),
fontfamily = rep('sans-serif',3), alpha = rep(0.25,2), lty = rep('blank', 2),
cat.fontfamily = rep('sans-serif',2)))
To see if the genes that are losing/gaining the label of DE have a true biological signal I’m going to look them up in Gupta’s list of DE genes.
Any match between datasets is considered to be evidence of the true DE of the gene, since it would be unlikely for a gene to be identified as DE in two different datasets because of unrelated technical problems.
Lost DE genes
It’s not necessarily bad to lose DE genes, it would only be a problem if these genes truly contained biological signals related to ASD.
cat(paste0(sum(genes_df$Gupta[genes_df$Original & !genes_df$Imputed]), '/', sum(genes_df$Original & !genes_df$Imputed),
' of the genes which are no longer DE were identified as DE in Gupta\'s dataset (',
round(100*mean(genes_df$Gupta[genes_df$Original & !genes_df$Imputed]),2),'%)'))
## 25/251 of the genes which are no longer DE were identified as DE in Gupta's dataset (9.96%)
Gained DE genes
Similarly, even though it would seem a positive thing to gain new DE genes, it’s only good when the genes contain real biological signal related to ASD.
cat(paste0(sum(genes_df$Gupta[!genes_df$Original & genes_df$Imputed]), '/', sum(!genes_df$Original & genes_df$Imputed),
' of the genes which are now DE are also considered to be DE in Gupta\'s dataset (',
round(100*mean(genes_df$Gupta[!genes_df$Original & genes_df$Imputed]),2),'%)'))
## 15/257 of the genes which are now DE are also considered to be DE in Gupta's dataset (5.84%)
The matches to Gupta’s dataset are quite small, so they may not be very robust, but it seems like we are losing more biological information than we are gaining from this new preprocessing pipeline!
I’m going to try to understand what happend to the gained/lost genes that are also in Gupta’s dataset to see if I can understand where the problem is
datExpr_raw = read.csv('./../Data/RNAseq_ASD_datExpr.csv', row.names=1)
datExpr_raw = datExpr_raw[rownames(datExpr_raw) %in% rownames(datExpr_imp), colnames(datExpr_raw) %in% colnames(datExpr_imp)]
z_scores = z_scores = datExpr_raw %>% apply(1, function(x) abs(x-mean(x))/sd(x)) %>% t %>% data.frame
max_z_scores = data.frame('ID' = rownames(datExpr_raw),
'max_z_score' = apply(z_scores, 1, max)) %>%
mutate('n_outliers' = apply(z_scores, 1, function(x) sum(x > mean(max_z_score) + 3*sd(max_z_score))))
outliers = which(max_z_scores$n_outliers>0)
max_z_scores$imputed_values = 0
datExpr = datExpr_raw
datMeta = datMeta_imp
for(i in outliers){
max_score = abs(max(abs(datExpr[i,]-mean(datExpr[i,]))/sd(datExpr[i,]))-mean(max_z_scores$max_z_score))/sd(max_z_scores$max_z_score)
while(max_score > 3){
sample = gsub('X','',names(which.max(abs(datExpr[i,]-mean(datExpr[i,]))/sd(datExpr[i,]))))
diagnosis = datMeta$Diagnosis[datMeta$Dissected_Sample_ID == sample] %>% as.character
columns = datMeta$Dissected_Sample_ID[datMeta$Diagnosis == diagnosis & datMeta$Dissected_Sample_ID != sample]
datExpr[i,paste0('X',sample)] = datExpr[i, paste0('X',columns)] %>% mean %>% round
max_z_scores$imputed_values[i] = max_z_scores$imputed_values[i] + 1
# Prepare for next round
max_score = abs(max(abs(datExpr[i,]-mean(datExpr[i,]))/sd(datExpr[i,]))-mean(max_z_scores$max_z_score))/sd(max_z_scores$max_z_score)
}
}
genes_df = genes_df %>% mutate('ID' = rownames(datExpr_imp)) %>% left_join(max_z_scores, by = 'ID')
rm(datExpr, datMeta, z_scores, i, outliers, max_score, sample, diagnosis, columns, datExpr_raw)
lost_genes = genes_df %>% filter(Original & !Imputed & Gupta) %>% left_join(gene_names, by = c('ID'='ensembl_gene_id'))
cat(paste0('Of the ', nrow(lost_genes), ' genes we lost from Gupta\'s DE genes, ',
sum(lost_genes$imputed_values>0), ' had had their max value imputed, which means that the other ',
sum(lost_genes$imputed_values==0), ' genes lost their DE tag because of the samples we identified as outliers and removed.'))
## Of the 25 genes we lost from Gupta's DE genes, 2 had had their max value imputed, which means that the other 23 genes lost their DE tag because of the samples we identified as outliers and removed.
plot_function = function(genes_list, i){
i = 3*i-2
last_j = 3
plot_list = list()
for(j in 1:last_j){
ID = genes_list$ID[i+j-1]
if(!is.na(ID)){
plot_data = data.frame('Sample' = colnames(datExpr_raw), 'expr' = datExpr_raw[ID,] %>% t, 'Diagnosis' = datMeta_orig$Diagnosis,
'removed_sample' = !colnames(datExpr_raw) %in% colnames(datExpr_imp)) %>%
mutate('shape' = ifelse(removed_sample, 4,16), Sample = as.character(Sample)) %>%
mutate('alpha' = ifelse(shape==16,0.6,1))
colnames(plot_data)[2] = 'expr'
# See if any value was imputed
datExpr_filtered = datExpr_raw[,colnames(datExpr_raw) %in% colnames(datExpr_imp)]
if(genes_list$imputed_values[i+j-1] > 0){
# Calculate imputed value
sample = gsub('X','',names(which.max(datExpr_filtered[ID,])))
diagnosis = datMeta_imp$Diagnosis[datMeta_imp$Dissected_Sample_ID == sample] %>% as.character
columns = datMeta_imp$Dissected_Sample_ID[datMeta_imp$Diagnosis == diagnosis & datMeta_imp$Dissected_Sample_ID != sample]
imp_value = datExpr_filtered[ID, paste0('X',columns)] %>% mean %>% round
# Add imputed value to plot_data
new_row = c(paste0('X',sample), imp_value, diagnosis, FALSE, 16)
plot_data = rbind(plot_data, new_row)
plot_data = plot_data %>% mutate(shape = as.numeric(ifelse(Sample == paste0('X',sample), 18, shape)),
expr = as.numeric(expr), Sample = as.factor(Sample),
alpha = ifelse(shape==16,0.6,1))
}
# Create plot
ggp = plot_data %>% ggplot(aes(Sample, expr+1, color=Diagnosis)) +
geom_hline(yintercept = median(plot_data$expr[plot_data$Diagnosis=='ASD'])+1, color='#00BFC4') +
geom_hline(yintercept = median(plot_data$expr[plot_data$Diagnosis=='CTL'])+1, color='#F8766D') +
geom_point(alpha = plot_data$alpha, shape = plot_data$shape, lwd = plot_data$alpha*2) +
theme_minimal() + scale_y_log10() +
theme(legend.position='none', axis.text.x=element_blank(), axis.ticks.x=element_blank())
if(sum(plot_data$removed_sample==TRUE)>0){
removed_samples = plot_data %>% dplyr::filter(removed_sample==TRUE) %>%
mutate(x = Sample, xend = Sample, y = expr, yend=median(plot_data$expr[plot_data$Diagnosis==Diagnosis]+1),
color = ifelse(Diagnosis=='ASD','#00BFC4','#F8766D'))
ggp = ggp + geom_segment(data = removed_samples, aes(x=Sample, y=expr, xend=Sample, yend = yend),
color = removed_samples$color, linetype = 'dotted')
}
if(genes_list$imputed_values[i+j-1] > 0) {
ggp = ggp + geom_segment(aes(x=paste0('X',sample), y=expr[Sample==paste0('X',sample)][1],
xend = paste0('X',sample), yend = expr[Sample==paste0('X',sample)][2]),
alpha = 0.5, color = ifelse(diagnosis=='ASD','#00BFC4','#F8766D'))
}
plot_list[[j]] = ggplotly(ggp)
} else last_j = j-1
}
annotations = list()
for(j in 1:last_j) {
annotations[[j]] = list(x = 0.1+1/last_j*(j-1) , y = 1.05, text = genes_list$Gene[i+j-1],
showarrow = F, xref='paper', yref='paper')
}
p = subplot(plot_list, nrows=1) %>% layout(annotations = annotations)
return(p)
}
Lost genes with imputed values
datExpr_raw = read.csv('./../Data/RNAseq_ASD_datExpr.csv', row.names=1)
datExpr_raw = datExpr_raw[rownames(datExpr_raw) %in% rownames(datExpr_imp), colnames(datExpr_raw) %in% colnames(datExpr_orig)]
genes_list = lost_genes %>% filter(imputed_values>0) %>% dplyr::rename('Gene' = external_gene_id)
plot_function(genes_list, 1)
rm(genes_list)
Lost genes without imputed values
These genes lost their DE tag because of the removed samples
Their adjusted p-value is still small but no longer below 0.05
genes_list = lost_genes %>% filter(imputed_values==0) %>%
left_join(DE_info_imp %>% dplyr::select(ID, log2FoldChange, padj), by = 'ID') %>%
dplyr::rename('Gene' = external_gene_id, 'LFC_now' = log2FoldChange, 'pval_now' = padj) %>%
left_join(DE_info_orig %>% dplyr::select(ID, log2FoldChange, padj), by = 'ID') %>%
dplyr::rename('LFC_before' = log2FoldChange, 'pval_before' = padj)
kable(genes_list %>% dplyr::select(Gene, LFC_before, LFC_now, pval_before, pval_now) %>% dplyr::arrange(desc(pval_now)))
| Gene | LFC_before | LFC_now | pval_before | pval_now |
|---|---|---|---|---|
| LAIR1 | 0.2407547 | 0.1936272 | 0.0355348 | 0.0951845 |
| DYSF | -0.1874271 | -0.1620711 | 0.0410731 | 0.0840487 |
| LGALS9 | 0.2627449 | 0.2329812 | 0.0330231 | 0.0811740 |
| CACNA1B | -0.0802029 | -0.0596707 | 0.0186024 | 0.0769917 |
| GPM6B | 0.0910740 | 0.0804009 | 0.0417699 | 0.0700452 |
| ORMDL2 | 0.1870408 | 0.1761004 | 0.0439827 | 0.0693773 |
| MCTP1 | -0.1367840 | -0.1325573 | 0.0494328 | 0.0687068 |
| ADCY9 | -0.0795976 | -0.0722156 | 0.0372166 | 0.0673555 |
| YPEL4 | -0.1097279 | -0.1005642 | 0.0417910 | 0.0624265 |
| C1QA | 0.4105865 | 0.3449752 | 0.0307228 | 0.0612419 |
| LASP1 | 0.0759806 | 0.0666278 | 0.0295123 | 0.0611956 |
| CACNA2D2 | -0.1737752 | -0.1479798 | 0.0286820 | 0.0585834 |
| SETD1A | -0.1160553 | -0.1000181 | 0.0241586 | 0.0585834 |
| WASF1 | -0.1152177 | -0.1127594 | 0.0408826 | 0.0580328 |
| KLK5 | 0.4114224 | 0.4038560 | 0.0472606 | 0.0574949 |
| ZMIZ2 | -0.1089273 | -0.0952768 | 0.0283087 | 0.0561784 |
| HIP1R | -0.0981093 | -0.0857949 | 0.0266818 | 0.0548443 |
| TMED5 | 0.1017375 | 0.0846044 | 0.0202803 | 0.0539410 |
| PRDX1 | 0.0968230 | 0.0854619 | 0.0210837 | 0.0531598 |
| VAMP5 | 0.2312376 | 0.2235125 | 0.0348144 | 0.0528330 |
| SBF1 | -0.0960837 | -0.0941124 | 0.0485784 | 0.0517666 |
| ASCC1 | 0.0868101 | 0.0819146 | 0.0423682 | 0.0514645 |
| EIF4E1B | -0.2054563 | -0.1823249 | 0.0269735 | 0.0512564 |
It looks like the value belonging to the removed samples contained helpful information
plot_function(genes_list, 1)
plot_function(genes_list, 2)
plot_function(genes_list, 3)
plot_function(genes_list, 4)
plot_function(genes_list, 5)
plot_function(genes_list, 6)
plot_function(genes_list, 7)
plot_function(genes_list, 8)
gained_genes = genes_df %>% filter(!Original & Imputed & Gupta) %>% left_join(gene_names, by = c('ID'='ensembl_gene_id'))
cat(paste0('Of the ', nrow(gained_genes), ' genes we gained from Gupta\'s DE genes, ',
sum(gained_genes$imputed_values>0), ' had had their max value imputed, which means that the other ',
sum(gained_genes$imputed_values==0), ' genes gained their DE tag because of the samples we identified as outliers and removed.'))
## Of the 15 genes we gained from Gupta's DE genes, 0 had had their max value imputed, which means that the other 15 genes gained their DE tag because of the samples we identified as outliers and removed.
Gained genes without imputed values
These genes gained their DE tag because of the removed samples
Their adjusted p-value is were small from the beginning but above 0.05
genes_list = gained_genes %>% filter(imputed_values==0) %>%
left_join(DE_info_imp %>% dplyr::select(ID, log2FoldChange, padj), by = 'ID') %>%
dplyr::rename('Gene' = external_gene_id, 'LFC_now' = log2FoldChange, 'pval_now' = padj) %>%
left_join(DE_info_orig %>% dplyr::select(ID, log2FoldChange, padj), by = 'ID') %>%
dplyr::rename('LFC_before' = log2FoldChange, 'pval_before' = padj)
kable(genes_list %>% dplyr::select(Gene, LFC_before, LFC_now, pval_before, pval_now) %>% dplyr::arrange(desc(pval_before)))
| Gene | LFC_before | LFC_now | pval_before | pval_now |
|---|---|---|---|---|
| VKORC1L1 | -0.0548949 | -0.0659234 | 0.1033499 | 0.0388468 |
| PPP6R1 | -0.0677603 | -0.0737994 | 0.0856979 | 0.0390671 |
| SLC30A4 | -0.0959336 | -0.1088998 | 0.0743135 | 0.0404671 |
| DDA1 | -0.0779904 | -0.0851168 | 0.0717262 | 0.0455727 |
| POLR2E | -0.0782448 | -0.0873556 | 0.0714477 | 0.0395353 |
| FBXW5 | -0.0860361 | -0.0928759 | 0.0694332 | 0.0463127 |
| MAL2 | -0.1251983 | -0.1405410 | 0.0648019 | 0.0402099 |
| ARL2 | -0.0995651 | -0.1063787 | 0.0576566 | 0.0487203 |
| SLC25A22 | -0.1155208 | -0.1193772 | 0.0553641 | 0.0445098 |
| FBXW7 | -0.1133259 | -0.1229711 | 0.0552993 | 0.0430954 |
| EEPD1 | -0.1259338 | -0.1321094 | 0.0541533 | 0.0379829 |
| BCL7B | -0.0711436 | -0.0847214 | 0.0530242 | 0.0192720 |
| USP42 | 0.0817366 | 0.0929551 | 0.0529046 | 0.0280962 |
| FBLL1 | -0.1243957 | -0.1323026 | 0.0528176 | 0.0369741 |
| SEC61A2 | -0.0968281 | -0.0965677 | 0.0526990 | 0.0496727 |
plot_function(genes_list, 1)
plot_function(genes_list, 2)
plot_function(genes_list, 3)
plot_function(genes_list, 4)
plot_function(genes_list, 5)